package pl.edu.icm.coansys.commons.hadoop;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.SystemUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;
import com.google.common.collect.Lists;
/**
 * Utility class for reading Hadoop sequence files from the local filesystem.
 * <p>
 * All methods are static; the class cannot be instantiated.
 *
 * @author madryk
 *
 */
public class LocalSequenceFileUtils {

    /** Name prefix of the files that hold the parts of a split sequence file. */
    private static final String PART_FILE_PREFIX = "part-";


    //------------------------ CONSTRUCTORS --------------------------

    private LocalSequenceFileUtils() {
        throw new IllegalArgumentException("Can't instantiate " + LocalSequenceFileUtils.class.getName() + " class");
    }


    //------------------------ LOGIC --------------------------

    /**
     * Reads a sequence file from the local filesystem.
     * <p>
     * The passed file object can be either a single sequence file or a directory
     * that contains a sequence file split into parts (<code>part-*</code> files).
     * A directory is scanned recursively and the part files are read in ascending
     * path order, so the order of the returned records does not depend on the
     * order in which the filesystem happens to list the directory entries.
     * <p>
     * The sequence file must contain keys and values of the types specified
     * as method parameters.
     *
     * @param sequenceFile a sequence file or a directory with <code>part-*</code> files
     * @param keyClass class of the keys stored in the sequence file
     * @param valueClass class of the values stored in the sequence file
     * @return all records that were read, as key-value pairs; records of a single
     *         file keep their order from that file
     * @throws IOException if reading any of the files fails
     */
    public static <K extends Writable, V extends Writable> List<Pair<K, V>> readSequenceFile(File sequenceFile, Class<K> keyClass, Class<V> valueClass) throws IOException {

        if (sequenceFile.isFile()) {
            return readSequenceFile(getAbsolutePath(sequenceFile), keyClass, valueClass);
        }

        List<Pair<K, V>> records = Lists.newArrayList();

        for (File partFile : listPartFiles(sequenceFile)) {
            records.addAll(readSequenceFile(getAbsolutePath(partFile), keyClass, valueClass));
        }

        return records;
    }


    //------------------------ PRIVATE --------------------------

    /**
     * Returns the <code>part-*</code> files found (recursively) under the given
     * directory, sorted by path.
     */
    private static List<File> listPartFiles(File directory) {

        List<File> partFiles = Lists.newArrayList();

        // listFiles returns only regular files, so checking the name is sufficient
        for (File file : FileUtils.listFiles(directory, null, true)) {
            if (file.getName().startsWith(PART_FILE_PREFIX)) {
                partFiles.add(file);
            }
        }

        // directory listing order is not guaranteed - sort to get a stable record order
        Collections.sort(partFiles);

        return partFiles;
    }

    /**
     * Reads all records from the single sequence file located under the given path.
     */
    private static <K extends Writable, V extends Writable> List<Pair<K, V>> readSequenceFile(Path path, Class<K> keyClass, Class<V> valueClass) throws IOException {

        List<Pair<K, V>> records = Lists.newArrayList();
        Configuration conf = new Configuration();

        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {

            while (true) {
                // the reader fills the objects passed to it, so every record needs its own instances
                K key = ReflectionUtils.newInstance(keyClass, conf);
                V value = ReflectionUtils.newInstance(valueClass, conf);

                if (!reader.next(key, value)) {
                    break;
                }

                records.add(ImmutablePair.of(key, value));
            }
        }

        return records;
    }

    /**
     * Converts the given local file into a hadoop path with the <code>file</code> scheme.
     */
    private static Path getAbsolutePath(File sequenceFile) {

        String pathString = sequenceFile.getAbsolutePath();

        if (SystemUtils.IS_OS_WINDOWS) {
            // hadoop utils assume that the absolute path starts with "/"
            return new Path("file:///" + pathString);
        }

        return new Path("file://" + pathString);
    }
}